suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(tidytext))Drive BC Network Analysis
Data Loading and Initial Cleaning
PLEASE NOTE: The data used in our repo is too big to upload to GitHub. We decided that it was best we .gitignore our data and have it on our local machines.
The data we used can be accessed through: Acccess data here my analysis on this page used the years 2018 to 2023 and I renamed the files to the format drivebceventshistXXXX where XXXX represents the year of the csv.
data_files <- list.files(path = "../data/",
pattern = "drivebceventshist.*\\.csv",
full.names = TRUE)
data_files <- data_files[grep("(2018|2019|2020|2021|2022|2023)", data_files)]
col_types <- cols(
EVENT_TYPE = col_character(),
EVENT_SUBTYPE = col_character(),
SEVERITY = col_character(),
CREATED = col_character(),
AREA_NAME = col_character(),
HEAD_LATITUDE = col_double(),
HEAD_LONGITUDE = col_double(),
ROAD_NAME = col_character()
)
# Read and combine with consistent types
road_data <- bind_rows(
lapply(data_files, function(file) {
read_csv(file, col_types = col_types) %>%
select(
EVENT_TYPE,
EVENT_SUBTYPE,
SEVERITY,
CREATED,
AREA_NAME,
HEAD_LATITUDE,
HEAD_LONGITUDE,
ROAD_NAME
) %>%
mutate(CREATED = as.POSIXct(CREATED)) # Convert to POSIXct for date-time operations
})
)head(road_data)nrow(road_data)[1] 1005385
Bipartite network for incidents and time factors
# Function for the creation of a bipartite network of events and time factors
create_incident_time_network <- function(data_sample) {
incident_time_edges <- data_sample %>%
mutate(
hour_created = hour(as.POSIXct(CREATED)),
time_of_day = case_when(
hour_created >= 6 & hour_created < 12 ~ "Morning",
hour_created >= 12 & hour_created < 18 ~ "Afternoon",
hour_created >= 18 & hour_created < 24 ~ "Evening",
hour_created >= 0 & hour_created < 6 ~ "Night"
),
day_of_week = weekdays(CREATED),
season = case_when(
month(CREATED) %in% c(12, 1, 2) ~ "Winter",
month(CREATED) %in% c(3, 4, 5) ~ "Spring",
month(CREATED) %in% c(6, 7, 8) ~ "Summer",
TRUE ~ "Fall"
)
) %>%
select(EVENT_TYPE, SEVERITY, AREA_NAME, time_of_day, day_of_week, season) %>%
pivot_longer(cols = c(time_of_day, day_of_week, season), names_to = "time_factor", values_to = "time_value") %>%
select(EVENT_TYPE, time_value)
incident_time_nodes <- data.frame(
name = unique(c(incident_time_edges$EVENT_TYPE, incident_time_edges$time_value)),
type = c(rep(TRUE, length(unique(incident_time_edges$EVENT_TYPE))),
rep(FALSE, length(unique(incident_time_edges$time_value))))
)
incident_time_graph <- graph_from_data_frame(d = incident_time_edges,
vertices = incident_time_nodes,
directed = FALSE)
return(incident_time_graph)
}
# Stratified sampling so we get a proportionate amount of events
set.seed(123) # For reproducibility of the sample
road_data_sample <- road_data %>%
group_by(EVENT_TYPE) %>%
sample_frac(0.001) %>% # .001 is about 1000 sample size for our data
ungroup()
incident_time_network <- create_incident_time_network(road_data_sample)
layout <- layout_as_bipartite(incident_time_network)
plot(
incident_time_network,
layout = layout,
vertex.label.cex = 0.6,
vertex.size = 5,
vertex.label.dist = 1.5,
main = "Bipartite Network of Incidents and Time (stratified sample n=1000)"
)The sample_frac function is used to take a stratified sample from the road_data dataframe, ensuring that each EVENT_TYPE is proportionally represented.
in_degrees <- degree(incident_time_network, mode = "in")
degree_data <- data.frame(
node = V(incident_time_network)$name,
in_degree = in_degrees
)
# Filter to include only time values
time_values <- degree_data %>%
filter(grepl("^[A-Z][a-z]", node))
time_values <- time_values %>%
arrange(desc(in_degree))
ggplot(time_values, aes(x = reorder(node, in_degree), y = in_degree)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "All Event Types In-Degree Distribution (2018-2023)", x = "Node", y = "In-Degree") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() Create a plot for each event type, using the entire dataset (2018-2023)
event_types <- unique(road_data$EVENT_TYPE)
years <- 2018:2023
# Combined data frame for all event types and years
combined_data <- do.call(rbind, lapply(event_types, function(event_type) {
do.call(rbind, lapply(years, function(year) {
event_data <- road_data %>%
filter(EVENT_TYPE == event_type, year(CREATED) == year)
event_network <- create_incident_time_network(event_data)
event_degree_data <- data.frame(
node = V(event_network)$name,
in_degree = degree(event_network, mode = "in"),
event_type = event_type,
year = year
)
event_degree_data %>%
filter(grepl("^[A-Z][a-z]", node))
}))
}))
combined_data <- combined_data %>%
group_by(event_type, node) %>%
mutate(node_total = sum(in_degree)) %>%
ungroup() %>%
group_by(event_type) %>%
mutate(total_degree = sum(in_degree)) %>%
ungroup()
ggplot(combined_data,
aes(x = reorder_within(node, desc(node_total), event_type),
y = in_degree,
fill = factor(year))) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "In-Degree Distribution by Event Type and Year",
x = "Node",
y = "In-Degree",
fill = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
facet_wrap(~ reorder(event_type, desc(total_degree)),
scales = "free_x",
nrow = 2,
ncol = 3) +
scale_x_reordered()Region event distribution plot
region_events <- road_data %>%
group_by(AREA_NAME, EVENT_TYPE) %>%
summarise(count = n(), .groups = 'drop') %>%
group_by(AREA_NAME) %>%
mutate(total_events = sum(count)) %>%
ungroup()
ggplot(region_events,
aes(x = count,
y = reorder(AREA_NAME, total_events),
fill = EVENT_TYPE)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = "Event Distribution by Region (2018-2023)",
x = "Number of Events",
y = "Region",
fill = "Event Type") +
theme_minimal() +
theme(
axis.text.y = element_text(size = 8),
legend.position = "right",
plot.title = element_text(hjust = 0.5)
)
region_eventsRelationship between maintenance/construction and incidents
# filter for highways only
road_network <- road_data %>%
filter(grepl("^(Hwy|Highway|Trans-Canada|BC-)", ROAD_NAME)) %>%
mutate(
event_category = case_when(
EVENT_TYPE %in% c("CONSTRUCTION", "MAINTENANCE") ~ "maintenance",
EVENT_TYPE %in% c("INCIDENT", "COLLISION") ~ "incident",
TRUE ~ "other"
)
) %>%
# Count events by road and category
group_by(ROAD_NAME, event_category) %>%
summarise(
count = n(),
.groups = 'drop'
) %>%
# Pivot to wide format to get maintenance and incident counts per road
pivot_wider(
names_from = event_category,
values_from = count,
values_fill = 0
)
# Create adjacency matrix based on similar incident patterns
road_matrix <- road_network %>%
select(maintenance, incident) %>%
dist() %>%
as.matrix()
g <- graph_from_adjacency_matrix(
(road_matrix < mean(road_matrix)) * 1,
mode = "undirected",
weighted = TRUE
)
g <- simplify(g, remove.loops = TRUE) # need to remove self-loops as they are not meaningful
V(g)$name <- road_network$ROAD_NAME
V(g)$maintenance <- road_network$maintenance
V(g)$incidents <- road_network$incident
V(g)$ratio <- road_network$incident / (road_network$maintenance + 1)
correlation <- cor.test(road_network$maintenance, road_network$incident)
layout <- layout_on_grid(g)
incident_sizes <- log1p(V(g)$incidents)
norm_sizes <- 3 + (incident_sizes - min(incident_sizes)) /
(max(incident_sizes) - min(incident_sizes)) * 15
plot(g,
layout = layout,
vertex.size = norm_sizes,
vertex.color = ifelse(V(g)$ratio > median(V(g)$ratio),
adjustcolor("red", alpha=0.7),
adjustcolor("green", alpha=0.7)),
vertex.label.cex = 0.7,
vertex.label.dist = 0.8,
vertex.label.color = "black",
vertex.frame.color = "gray50",
edge.color = adjustcolor("gray40", alpha=0.2),
edge.width = 0.5,
main = paste("Road Network: Maintenance vs Incidents\n",
"Correlation =", round(correlation$estimate, 3)))
legend("bottomright",
legend=c("High incident/maintenance ratio", "Low incident/maintenance ratio"),
fill=c(adjustcolor("red", alpha=0.7), adjustcolor("green", alpha=0.7)),
cex=0.6,
bty="n")
# statistical summary
summary_stats <- road_network %>%
summarise(
correlation = cor(maintenance, incident),
roads_with_high_maintenance = sum(maintenance > mean(maintenance)),
roads_with_high_incidents = sum(incident > mean(incident))
)
summary_stats# Ordered list of roads and weights
road_weights <- data.frame(
road = V(g)$name,
incidents = V(g)$incidents,
maintenance = V(g)$maintenance,
ratio = V(g)$ratio
) %>%
arrange(desc(ratio)) %>%
mutate(
formatted_output = sprintf(
"%s: %.2f incidents per maintenance (incidents: %d, maintenance: %d)",
road,
ratio,
incidents,
maintenance
)
)
cat("\nRoads ordered by incident/maintenance ratio:\n")
Roads ordered by incident/maintenance ratio:
cat(paste0(seq_along(road_weights$formatted_output), ". ",
road_weights$formatted_output,
collapse = "\n"))1. Highway 77: 29.83 incidents per maintenance (incidents: 179, maintenance: 5)
2. Highway 103: 14.89 incidents per maintenance (incidents: 268, maintenance: 17)
3. Highway 51: 8.88 incidents per maintenance (incidents: 213, maintenance: 23)
4. Highway 97D: 7.45 incidents per maintenance (incidents: 447, maintenance: 59)
5. Highway 52E: 7.00 incidents per maintenance (incidents: 7, maintenance: 0)
6. Highway 8: 6.06 incidents per maintenance (incidents: 1607, maintenance: 264)
7. Highway 31A: 5.38 incidents per maintenance (incidents: 210, maintenance: 38)
8. Highway 40: 5.00 incidents per maintenance (incidents: 15, maintenance: 2)
9. Highway 7B: 4.51 incidents per maintenance (incidents: 659, maintenance: 145)
10. Highway 41: 4.38 incidents per maintenance (incidents: 35, maintenance: 7)
11. Highway 91A: 4.36 incidents per maintenance (incidents: 659, maintenance: 150)
12. Highway 5A: 4.16 incidents per maintenance (incidents: 2427, maintenance: 582)
13. Highway 113: 4.00 incidents per maintenance (incidents: 4, maintenance: 0)
14. Highway 91: 3.91 incidents per maintenance (incidents: 4708, maintenance: 1204)
15. Highway 101: 2.70 incidents per maintenance (incidents: 426, maintenance: 157)
16. Highway 26: 2.23 incidents per maintenance (incidents: 820, maintenance: 367)
17. Highway 12: 2.02 incidents per maintenance (incidents: 736, maintenance: 363)
18. Highway 7A: 2.00 incidents per maintenance (incidents: 2, maintenance: 0)
19. Highway 22A: 1.91 incidents per maintenance (incidents: 42, maintenance: 21)
20. Highway 13: 1.72 incidents per maintenance (incidents: 136, maintenance: 78)
21. Highway 31: 1.69 incidents per maintenance (incidents: 446, maintenance: 263)
22. Highway 6 EW: 1.68 incidents per maintenance (incidents: 1906, maintenance: 1135)
23. Highway 17A: 1.62 incidents per maintenance (incidents: 263, maintenance: 161)
24. Highway 6 NS: 1.52 incidents per maintenance (incidents: 448, maintenance: 293)
25. Highway 3A: 1.52 incidents per maintenance (incidents: 2147, maintenance: 1416)
26. Highway 23: 1.48 incidents per maintenance (incidents: 2616, maintenance: 1764)
27. Highway 99: 1.47 incidents per maintenance (incidents: 10583, maintenance: 7202)
28. Highway 49: 1.45 incidents per maintenance (incidents: 530, maintenance: 364)
29. Highway 11: 1.42 incidents per maintenance (incidents: 664, maintenance: 468)
30. Highway 1: 1.41 incidents per maintenance (incidents: 39207, maintenance: 27824)
31. Highway 17 SFPR: 1.29 incidents per maintenance (incidents: 89, maintenance: 68)
32. Highway 97A: 1.27 incidents per maintenance (incidents: 1251, maintenance: 984)
33. Highway 6: 1.20 incidents per maintenance (incidents: 12, maintenance: 9)
34. Highway 93: 1.19 incidents per maintenance (incidents: 1096, maintenance: 923)
35. Highway 18: 1.17 incidents per maintenance (incidents: 132, maintenance: 112)
36. Highway 5: 1.07 incidents per maintenance (incidents: 6887, maintenance: 6412)
37. Highway 395: 1.07 incidents per maintenance (incidents: 30, maintenance: 27)
38. Highway 59: 1.07 incidents per maintenance (incidents: 15, maintenance: 13)
39. Highway 97C: 1.04 incidents per maintenance (incidents: 1197, maintenance: 1149)
40. Highway 4A: 1.03 incidents per maintenance (incidents: 72, maintenance: 69)
41. Highway 22: 1.02 incidents per maintenance (incidents: 126, maintenance: 122)
42. Highway 2: 1.02 incidents per maintenance (incidents: 583, maintenance: 573)
43. Highway 3: 0.91 incidents per maintenance (incidents: 6662, maintenance: 7305)
44. Highway 97: 0.89 incidents per maintenance (incidents: 13606, maintenance: 15373)
45. Highway 33: 0.88 incidents per maintenance (incidents: 592, maintenance: 670)
46. Highway 7: 0.85 incidents per maintenance (incidents: 2145, maintenance: 2522)
47. Highway 17: 0.83 incidents per maintenance (incidents: 2085, maintenance: 2525)
48. Highway 15: 0.76 incidents per maintenance (incidents: 606, maintenance: 795)
49. Highway 37A: 0.75 incidents per maintenance (incidents: 810, maintenance: 1081)
50. Highway 20: 0.65 incidents per maintenance (incidents: 1060, maintenance: 1628)
51. Highway 10: 0.63 incidents per maintenance (incidents: 545, maintenance: 865)
52. Highway 14: 0.62 incidents per maintenance (incidents: 1086, maintenance: 1758)
53. Highway 37: 0.60 incidents per maintenance (incidents: 1957, maintenance: 3282)
54. Highway 29: 0.59 incidents per maintenance (incidents: 847, maintenance: 1430)
55. Highway 24: 0.58 incidents per maintenance (incidents: 316, maintenance: 543)
56. Highway 1A: 0.57 incidents per maintenance (incidents: 124, maintenance: 218)
57. Highway 30: 0.56 incidents per maintenance (incidents: 132, maintenance: 235)
58. Highway 3B: 0.54 incidents per maintenance (incidents: 90, maintenance: 165)
59. Highway 95: 0.47 incidents per maintenance (incidents: 485, maintenance: 1039)
60. Highway 43: 0.47 incidents per maintenance (incidents: 69, maintenance: 147)
61. Highway 4: 0.45 incidents per maintenance (incidents: 900, maintenance: 2015)
62. Highway 52 E: 0.42 incidents per maintenance (incidents: 150, maintenance: 357)
63. Highway 28: 0.42 incidents per maintenance (incidents: 333, maintenance: 800)
64. Highway 16: 0.38 incidents per maintenance (incidents: 4110, maintenance: 10808)
65. Highway 39: 0.38 incidents per maintenance (incidents: 45, maintenance: 119)
66. Highway 97B: 0.36 incidents per maintenance (incidents: 82, maintenance: 229)
67. Highway 52 N: 0.35 incidents per maintenance (incidents: 87, maintenance: 245)
68. Highway 19: 0.34 incidents per maintenance (incidents: 1185, maintenance: 3532)
69. Highway 21: 0.33 incidents per maintenance (incidents: 45, maintenance: 137)
70. Highway 19A: 0.32 incidents per maintenance (incidents: 494, maintenance: 1559)
71. Highway 95A: 0.23 incidents per maintenance (incidents: 90, maintenance: 394)
72. Highway 62: 0.20 incidents per maintenance (incidents: 7, maintenance: 34)
73. Highway 9: 0.17 incidents per maintenance (incidents: 104, maintenance: 623)
74. Highway 27: 0.15 incidents per maintenance (incidents: 69, maintenance: 458)
75. Highway 35: 0.06 incidents per maintenance (incidents: 26, maintenance: 413)
76. Highway 118: 0.06 incidents per maintenance (incidents: 16, maintenance: 265)
77. Highway 99A: 0.04 incidents per maintenance (incidents: 2, maintenance: 55)
The high correlation suggests that maintenance work is reactive - roads with more incidents tend to receive more maintenance, rather than maintenance preventing incidents.
- Problem roads get more maintenance attention
- High-traffic roads naturally have both more incidents and require more maintenance
- Maintenance work itself could potentially increase the likelihood of incidents
Community detection based on maintenance and incident patterns
communities <- cluster_louvain(g)
V(g)$community <- membership(communities)
n_communities <- length(unique(V(g)$community))
community_colors <- rainbow(n_communities, alpha=0.6)
par(mfrow=c(1,2), mar=c(2,2,3,2))
layout_grid <- layout_on_grid(g)
plot(g,
layout = layout_grid,
vertex.size = norm_sizes,
vertex.color = community_colors[V(g)$community],
vertex.label.cex = 0.6,
vertex.label.dist = 1.2,
vertex.label.color = "black",
vertex.frame.color = "gray50",
edge.color = adjustcolor("gray40", alpha=0.2),
edge.width = 0.5,
main = "Grid Layout: BC Highway Communities
based on maintenance and incident patterns")
layout_fr <- layout_with_fr(g, niter=500, grid="nogrid")
plot(g,
layout = layout_fr,
vertex.size = norm_sizes,
vertex.color = community_colors[V(g)$community],
vertex.label.cex = 0.6,
vertex.label.dist = 1.2,
vertex.label.color = "black",
vertex.frame.color = "gray50",
edge.color = adjustcolor("gray40", alpha=0.2),
edge.width = 0.5,
main = "Fruchterman-Reingold Layout: BC Highway Communities
based on maintenance and incident patterns")
par(mfrow=c(1,1), mar=c(5,4,4,2)+0.1)
community_sizes <- table(membership(communities))
cluster_stats <- data.frame(
community = as.numeric(names(community_sizes)),
cluster_size = as.numeric(community_sizes),
avg_incidents = tapply(V(g)$incidents, V(g)$community, mean),
avg_maintenance = tapply(V(g)$maintenance, V(g)$community, mean)
)
cluster_stats <- cluster_stats[order(-cluster_stats$cluster_size), ]
cat("\nCluster Summary:\n")
Cluster Summary:
for(i in 1:nrow(cluster_stats)) {
community_id <- cluster_stats$community[i]
cat(sprintf("\nCluster %d (size: %d):\n",
community_id,
cluster_stats$cluster_size[i]))
cat("Members:", paste(V(g)$name[V(g)$community == community_id], collapse=", "), "\n")
cat(sprintf("Average incidents: %.1f\n", cluster_stats$avg_incidents[i]))
cat(sprintf("Average maintenance: %.1f\n", cluster_stats$avg_maintenance[i]))
}
Cluster 2 (size: 37):
Members: Highway 10, Highway 101, Highway 11, Highway 118, Highway 12, Highway 14, Highway 15, Highway 17A, Highway 19, Highway 19A, Highway 1A, Highway 2, Highway 20, Highway 24, Highway 26, Highway 27, Highway 28, Highway 29, Highway 30, Highway 31, Highway 33, Highway 35, Highway 37, Highway 37A, Highway 3B, Highway 4, Highway 49, Highway 52 E, Highway 52 N, Highway 6 NS, Highway 7B, Highway 9, Highway 91A, Highway 95, Highway 95A, Highway 97B, Highway 97D
Average incidents: 509.9
Average maintenance: 755.7
Cluster 3 (size: 23):
Members: Highway 103, Highway 113, Highway 13, Highway 17 SFPR, Highway 18, Highway 21, Highway 22, Highway 22A, Highway 31A, Highway 39, Highway 395, Highway 40, Highway 41, Highway 43, Highway 4A, Highway 51, Highway 52E, Highway 59, Highway 6, Highway 62, Highway 77, Highway 7A, Highway 99A
Average incidents: 76.3
Average maintenance: 48.0
Cluster 5 (size: 11):
Members: Highway 17, Highway 23, Highway 3A, Highway 5A, Highway 6 EW, Highway 7, Highway 8, Highway 91, Highway 93, Highway 97A, Highway 97C
Average incidents: 2107.7
Average maintenance: 1315.3
Cluster 6 (size: 2):
Members: Highway 3, Highway 5
Average incidents: 6774.5
Average maintenance: 6858.5
Cluster 1 (size: 1):
Members: Highway 1
Average incidents: 39207.0
Average maintenance: 27824.0
Cluster 4 (size: 1):
Members: Highway 16
Average incidents: 4110.0
Average maintenance: 10808.0
Cluster 7 (size: 1):
Members: Highway 97
Average incidents: 13606.0
Average maintenance: 15373.0
Cluster 8 (size: 1):
Members: Highway 99
Average incidents: 10583.0
Average maintenance: 7202.0